Please specify below the focus year of this analysis. It will be considered the end point of historical analyses, as well as the year for annual analyses.
# End point of historical analyses and the year used for annual analyses.
focus_year = "2017"
Please select the focus language using one of the following values:
# ISO 639-1 code of the language whose reports will be analysed.
focus_language = 'en'
# Per-language settings: display name and the minimum number of keyword
# co-occurrences required before an edge is drawn in the graph at the end.
# (Key 'min_coocurrence' is spelled this way throughout the file.)
language_ref = { 'en' : { 'name' : 'English', 'min_coocurrence' : 10},
                 'de' : { 'name' : 'German', 'min_coocurrence' : 2},
                 'es' : { 'name' : 'Spanish', 'min_coocurrence' : 2},
                 'fr' : { 'name' : 'French', 'min_coocurrence' : 2},
                 'pt' : { 'name' : 'Portuguese', 'min_coocurrence' : 2},
               }
The UN Global Compact website contains entries for each COP report, describing the sector of the company submitting the report, country and year, as well as the language in which the report was written and a link to a PDF file with the full report.
The results in this section give a general view of the available COPs; they are not yet restricted by the focus_year and focus_language.
# Scrape the UN Global Compact COP listing: fetch one small page first to
# learn the total number of COPs, then request the full list in one page.
import requests
import re
from bs4 import BeautifulSoup
gc_url = "https://www.unglobalcompact.org/participation/report/cop/create-and-submit/active?page=1&per_page=10"
gc_base_url = "https://www.unglobalcompact.org"
gc_home = requests.get(gc_url)
soup = BeautifulSoup(gc_home.content, 'lxml')
# The page's first <h2> ends with ": <count>"; extract the digits after ": ".
header = soup.h2.string
total_num_cops = re.search(r'(?<=: )[0-9]+', header)[0]
print("Total number of COPs available: %s" % total_num_cops)
# Re-request with per_page set to the total, so everything fits in one page.
full_gc_url = "https://www.unglobalcompact.org/participation/report/cop/create-and-submit/active?page=1&per_page=" + total_num_cops
print("Getting full list of reports ...")
gc_full_list = requests.get(full_gc_url)
gc_full_list_soup = BeautifulSoup(gc_full_list.content, 'lxml')
def check_sdgs_3_13(profile): #checks if SDGs 3 and 13 are selected
    """Check whether SDG 3 (health) and SDG 13 (climate) are ticked in a COP profile.

    profile: BeautifulSoup document of a participant's COP entry page.
    Returns a tuple ("yes"/"no", "yes"/"no") for (SDG 3, SDG 13).
    """
    has_sdg3 = "no"
    has_sdg13 = "no"
    sdgs = []
    questions = profile.find_all("ul", class_='questionnaire')
    if len(questions) == 2:
        sdgs = questions[0].find_all("li")
        if len(sdgs) != 18: # the correct SDG questionnaire has 17 questions + header
            temp_sdgs = questions[1].find_all("li")
            if len(temp_sdgs) == 18:
                sdgs = temp_sdgs
            else:
                sdgs = []
    # Fix: the original indexed sdgs[3]/sdgs[13] even when no 18-item
    # questionnaire was found, raising IndexError on the empty list.
    if len(sdgs) == 18:
        # item 0 is the header, so SDG n sits at index n
        if 'selected_question' in sdgs[3].get('class'):
            has_sdg3 = "yes"
        if 'selected_question' in sdgs[13].get('class'):
            has_sdg13 = "yes"
    return (has_sdg3, has_sdg13)
# Walk every row of the full listing, visit each participant's entry page,
# record SDG 3/13 selections and index every attached PDF report.
participants = gc_full_list_soup.tbody.find_all("tr")
pdfs = {}          # attachment URL -> metadata dict
num_pdfs = 0
num_nonpdfs = 0
num_noreport = 0
# The language appears in parentheses at the end of the link text, e.g. "(English)".
langregex = re.compile(r'(?<=\()[^\)\(]+(?=\)$)')
print("Getting details of each report ...")
for participant in participants:
    cells = participant.find_all('td')
    sector = cells[1].get_text(strip=True)
    country = cells[2].get_text(strip=True)
    year = cells[3].get_text(strip=True)
    participant_entry_url = gc_base_url + cells[0].a.get('href')
    participant_profile = requests.get(participant_entry_url)
    participant_profile_soup = BeautifulSoup(participant_profile.content, 'lxml')
    (participant_sdgs_3, participant_sdgs_13) = check_sdgs_3_13(participant_profile_soup)
    main_body = participant_profile_soup.find("section", class_='main-content-body')
    list_items = main_body.find_all("li")
    found_report = False
    for li in list_items:
        if li.a:
            link = li.a.get('href')
            # only attachment links are actual report files
            if "/system/attachments/" in link:
                if ".pdf" in link:
                    num_pdfs += 1
                    language = langregex.search(li.get_text(strip=True))[0]
                    pdfs[link] = { "sector" : sector, "country" : country, "year" : year, "language" : language, "sdgs3" : participant_sdgs_3, "sdgs13" : participant_sdgs_13}
                    print(".", end='')
                else:
                    num_nonpdfs += 1
                found_report = True
    if not found_report:
        num_noreport += 1
print(" done.")
print("PDFs: %d, non-PDFs: %d, no-report: %d" % (num_pdfs, num_nonpdfs, num_noreport))
Saving index of reports so that it can be reused
# Persist the scraped index as a tab-separated CSV so later runs can skip scraping.
import pandas as pd
reports_index_csv_filename = "../data/cops/reports_index.csv"
df_pdfs = pd.DataFrame.from_dict(pdfs, orient='index')
df_pdfs.to_csv(reports_index_csv_filename, sep='\t', encoding='utf-8')
Possible starting point: This can be used when an index file is available (has been saved previously). Only run this cell if starting from this point, otherwise skip it.
# Alternative entry point: reload a previously saved index instead of scraping.
import pandas as pd
reports_index_csv_filename = "../data/cops/reports_index.csv"
# dtype={'year': object} keeps years as strings, matching the scraped form
df_pdfs = pd.read_csv(reports_index_csv_filename, sep='\t', encoding='utf-8', index_col=0, dtype={'year': object})
pdfs = df_pdfs.to_dict(orient='index')
# Overview statistics of the whole (unfiltered) report index:
# counts per country/sector/year/language and SDG 3/13 selections.
countries = {}
sectors = {}
years = {}
languages = {}
sdgs3 = 0
sdgs13 = 0
sdgs3_13 = 0
for pdf in pdfs.keys():
    language = pdfs[pdf]["language"]
    year = pdfs[pdf]["year"]
    country = pdfs[pdf]["country"]
    sector = pdfs[pdf]["sector"]
    sdg3 = pdfs[pdf]["sdgs3"]
    sdg13 = pdfs[pdf]["sdgs13"]
    sectors[sector] = sectors.get(sector,0) + 1
    countries[country] = countries.get(country,0) + 1
    years[year] = years.get(year,0) + 1
    languages[language] = languages.get(language,0) + 1
    if sdg3 == "yes":
        sdgs3 += 1
    if sdg13 == "yes":
        sdgs13 += 1
    if sdg3 == "yes" and sdg13 == "yes":
        sdgs3_13 += 1
print("Number of reports that include SDG 3 or SDG 13 or both: %d, %d, %d respectively" % (sdgs3, sdgs13, sdgs3_13))
# Summary tables, sorted by number of reports (descending).
df_languages = pd.DataFrame(sorted(languages.items(), key=lambda k: k[1], reverse=True), columns=["Language", "Number of reports"])
df_languages
df_countries = pd.DataFrame(sorted(countries.items(), key=lambda k: k[1], reverse=True), columns=["Country", "Number of reports"])
df_countries
df_sectors = pd.DataFrame(sorted(sectors.items(), key=lambda k: k[1], reverse=True), columns=["Sector", "Number of reports"])
df_sectors
df_years = pd.DataFrame(sorted(years.items(), reverse=True), columns=["Year", "Number of reports"])
df_years
# Restrict the corpus to reports in the focus language submitted up to and
# including the focus year, and recompute the per-dimension counts.
selected_sectors = {}
selected_countries = {}
selected_years = {}
selected_countries_years = {}   # country -> {year -> report count}
selected_pdfs = {}
for pdf in pdfs.keys():
    language = pdfs[pdf]["language"]
    year = pdfs[pdf]["year"]
    country = pdfs[pdf]["country"]
    sector = pdfs[pdf]["sector"]
    if language == language_ref[focus_language]['name'] and int(year) <= int(focus_year):
        selected_pdfs[pdf] = pdfs[pdf]
        selected_sectors[sector] = selected_sectors.get(sector,0) + 1
        selected_countries[country] = selected_countries.get(country,0) + 1
        selected_years[year] = selected_years.get(year,0) + 1
        if country in selected_countries_years.keys():
            selected_countries_years[country][year] = selected_countries_years[country].get(year,0) + 1
        else:
            selected_countries_years[country] = {year : 1}
print("There are %d reports up to %s written in %s" % (len(selected_pdfs.keys()), focus_year, language_ref[focus_language]['name']))
df_selected_countries = pd.DataFrame(sorted(selected_countries.items(), key=lambda k: k[1], reverse=True), columns=["Country", "Number of reports"])
df_selected_countries
df_selected_sectors = pd.DataFrame(sorted(selected_sectors.items(), key=lambda k: k[1], reverse=True), columns=["Sector", "Number of reports"])
df_selected_sectors
df_selected_years = pd.DataFrame(sorted(selected_years.items(), reverse=True), columns=["Year", "Number of reports"])
df_selected_years
At this time we've only considered reports written in the focus language and submitted up to end of the focus year.
A folder should be specified as the location where PDFs will be downloaded to ('pdfs_folder' variable below).
If this process has been run before and some files are already available in the specified folder, they won't be downloaded again.
# Download every selected PDF into pdfs_folder, skipping files already present.
pdfs_folder = "../data/cops/pdfs/"
# Capture the final path component (the file name) of an attachment URL.
filenameregex = re.compile(r'(?<=/)[^$/]+(?=$)')
import PyPDF2
import shutil
import nltk
import os
#import os.path
# Create the folder if needed (replaces the original os.stat probe
# guarded by a bare except, which could mask unrelated errors).
os.makedirs(pdfs_folder, exist_ok=True)
for pdf in selected_pdfs.keys():
    filename = pdfs_folder + filenameregex.search(pdf)[0]
    if not os.path.isfile(filename):
        print("Saving %s" % (filename))
        file = requests.get(gc_base_url + pdf, stream=True)
        try:
            # stream the response body straight to disk
            with open(filename, 'wb') as out_file:
                shutil.copyfileobj(file.raw, out_file)
            del file
        except OSError:  # narrowed from a bare except: only I/O failures are expected here
            print("Could not save %s" % (filename))
            continue
    else:
        print("Skipping %s, PDF already available in folder" % (filename))
A folder should be specified as the location where text files will be saved at ('txts_folder' variable below).
This process may fail to extract the text from some PDF files.
If this process has been run before and some text files are already available in the specified folder, they won't be processed again.
# Extract plain text from each downloaded PDF into txts_folder, skipping
# reports whose text file already exists.
txts_folder = "../data/cops/txts/"
os.makedirs(txts_folder, exist_ok=True)
for pdf in selected_pdfs.keys():
    filename = pdfs_folder + filenameregex.search(pdf)[0]
    filenametxt = txts_folder + filenameregex.search(pdf)[0] + ".txt"
    if os.path.isfile(filenametxt):
        print("Skipping %s, TXT already available in folder" % (filename))
        continue
    print("Loading %s" % (filename))
    # Fix: the original opened the .txt output *before* parsing the PDF, so a
    # failed parse left behind an empty .txt that made later runs skip the
    # report; it also leaked both file handles on the failure path.
    try:
        pdfFileObj = open(filename, 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        num_pages = pdfReader.numPages
    except Exception:
        print("Couldn't load %s" % (filename))
        continue
    print("Extracting text from %s" % (filename))
    with open(filenametxt, 'w') as txtFileObj:
        for num_page in range(0, num_pages):
            try:
                pageObj = pdfReader.getPage(num_page)
                txtFileObj.write(pageObj.extractText())
            except Exception:
                print("Couldn't extract txt %s, page %d" % (filename, num_page))
                continue
    pdfFileObj.close()
# Load the language-specific keyword dictionaries (climate and health terms).
import json
import os
import re
keywords_file = os.path.join("..", "keywords", focus_language + ".json")
climate_dict = []
health_dict = []
# Multi-word keywords, as word tuples, for the MWETokenizer below.
compound_terms = []
def normalise_keywords(dictionary): #lowercases and handles compounds
    """Lowercase each keyword in *dictionary* (in place) and register compounds.

    Multi-word keywords are rewritten with underscores ("air pollution" ->
    "air_pollution"), and their component-word tuples are appended to the
    module-level ``compound_terms`` list for the MWETokenizer.
    Returns the same (mutated) list.
    """
    # enumerate instead of the original index-based range(len(...)) loop
    for i, raw_keyword in enumerate(dictionary):
        keyword = raw_keyword.lower()
        compound = keyword.replace(' ', '_')
        if compound != keyword:
            keyword = compound
            # remember the word sequence so the tokenizer can re-join it later
            compound_terms.append(tuple(compound.split('_')))
        dictionary[i] = keyword
    return dictionary
def generate_hashtags(dictionary):
    """Turn each keyword into a hashtag: strip underscores and prefix '#'."""
    return ["#" + term.replace('_', '') for term in dictionary]
# Read the JSON keyword file; normalisation also fills compound_terms.
with open(keywords_file) as f:
    data = json.load(f)
    climate_dict = normalise_keywords(data['climate'])
    health_dict = normalise_keywords(data['health'])
# Hashtag variants (currently unused below; kept for completeness).
climate_hashtag_dict = generate_hashtags(climate_dict)
health_hashtag_dict = generate_hashtags(health_dict)
health_dict
climate_dict
# WHO region -> list of country names, spelled to match the Global Compact
# site's country column (including some duplicate spellings on purpose).
# NOTE(review): "Ã…land Islands" and "Saint BarthÕ©lemy" below look mojibake-
# encoded; presumably they mirror the scraped data — confirm before fixing,
# since country matching is done by exact string comparison.
who_regions = {}
who_regions["Africa"] = ["Algeria", "Angola", "Benin", "Botswana", "British Indian Ocean Territory",
                         "Burkina Faso", "Burundi", "Cabo Verde", "Cameroon", "Central African Republic",
                         "Chad", "Comoros", "Congo, Democratic Republic of the", "Cote d'Ivoire",
                         "Democratic Republic of the Congo", "Congo, Republic of the",
                         "Equatorial Guinea", "Eritrea", "Ethiopia", "French Southern Territories", "Gabon",
                         "Gambia", "Ghana", "Guinea", "Guinea-Bissau", "Kenya", "Lesotho", "Liberia",
                         "Madagascar", "Malawi", "Mali", "Mauritania", "Mauritius", "Mayotte", "Mozambique",
                         "Namibia", "Niger", "Nigeria", "Rwanda", "Réunion", "Saint Helena",
                         "Sao Tome And Principe", "Senegal", "Seychelles", "Sierra Leone", "South Africa",
                         "South Sudan", "Swaziland", "Togo", "Uganda", "Tanzania, United Republic of",
                         "Western Sahara", "Zambia", "Zimbabwe"]
who_regions["Eastern Mediterranean"] = ["Afghanistan", "Bahrain", "Djibouti", "Egypt",
                                        "Iran, Islamic Republic of", "Iraq", "Jordan", "Kuwait", "Lebanon",
                                        "Libya", "Morocco", "Oman", "Pakistan", "Qatar", "Saudi Arabia",
                                        "Somalia", "Palestine, State of", "Sudan", "Syrian Arab Republic",
                                        "Tunisia", "United Arab Emirates", "Yemen"]
who_regions["Europe"] = ["Albania", "Andorra", "Armenia", "Austria", "Azerbaijan", "Belarus", "Belgium",
                         "Bosnia-Herzegovina", "Bulgaria", "Croatia", "Cyprus", "Czechia", "Denmark",
                         "Estonia", "Faroe Islands", "Finland", "France", "Georgia", "Germany", "Gibraltar",
                         "Greece", "Guernsey", "Holy See", "Hungary", "Iceland", "Ireland", "Isle of Man",
                         "Israel", "Italy", "Jersey", "Kazakhstan", "Kyrgyzstan", "Latvia", "Liechtenstein",
                         "Lithuania", "Luxembourg", "Malta", "Monaco", "Montenegro", "Netherlands", "Norway",
                         "Poland", "Portugal", "Moldova, Republic of", "Romania", "Russian Federation",
                         "San Marino", "Sark", "Serbia", "Slovakia", "Slovenia", "Spain",
                         "Svalbard and Jan Mayen Islands", "Sweden", "Switzerland", "Tajikistan",
                         "Macedonia, The former Yugoslav Republic of", "Turkey", "Turkmenistan", "Ukraine",
                         "United Kingdom", "Uzbekistan",
                         "Ã…land Islands"]
who_regions["Latin America and the Caribbean"] = ["Anguilla", "Antigua and Barbuda", "Argentina", "Aruba",
                                                  "Bahamas", "Barbados", "Belize", "Bolivia", "Bonaire",
                                                  "Bouvet Island", "Brazil", "British Virgin Islands",
                                                  "Cayman Islands", "Chile", "Colombia", "Costa Rica", "Cuba",
                                                  "Curaçao", "Dominica", "Dominican Republic", "Ecuador",
                                                  "El Salvador", "Falkland Islands (Malvinas)", "French Guiana",
                                                  "Grenada", "Guadeloupe", "Guatemala", "Guyana", "Haiti",
                                                  "Honduras", "Jamaica", "Martinique", "Mexico", "Montserrat",
                                                  "Nicaragua", "Panama", "Paraguay", "Peru", "Puerto Rico",
                                                  "Saint BarthÕ©lemy", "Saint Kitts and Nevis", "Saint Lucia",
                                                  "Saint Martin", "Saint Vincent and the Grenadines",
                                                  "Sint Maarten", "South Georgia and the South Sandwich Islands",
                                                  "Suriname", "Trinidad And Tobago", "Turks and Caicos Islands",
                                                  "Virgin Islands", "Uruguay", "Venezuela"]
who_regions["North America"] = ["Bermuda", "Canada", "Greenland", "Saint Pierre and Miquelon",
                                "United States of America"]
who_regions["South-East Asia"] = ["Bangladesh", "Bhutan", "Democratic People's Republic of Korea", "India",
                                  "Indonesia", "Maldives", "Myanmar", "Nepal", "Sri Lanka", "Thailand",
                                  "Timor-Leste"]
who_regions["Western Pacific"] = ["American Samoa", "Australia", "Brunei", "Cambodia", "China",
                                  "Hong Kong", "Macao", "Taiwan", "Christmas Island", "Cocos (Keeling) Islands",
                                  "Cook Islands", "Fiji", "French Polynesia", "Guam",
                                  "Heard Island and McDonald Islands", "Japan", "Kiribati",
                                  "Laos", "Malaysia", "Marshall Islands", "Micronesia", "Mongolia", "Nauru",
                                  "New Caledonia", "New Zealand", "Niue", "Norfolk Island",
                                  "Northern Mariana Islands", "Palau", "Papua New Guinea", "Philippines",
                                  "Pitcairn", "Korea, Republic of", "Samoa", "Singapore", "Solomon Islands",
                                  "Tokelau", "Tonga", "Tuvalu", "Minor Outlying Islands", "Vanuatu",
                                  "Viet Nam", "Wallis and Futuna Islands"]
def get_who_region(country):
    """Map a Global Compact country name to its WHO region.

    Falls back to prefix matching for names the site truncates with "..."
    (e.g. "Russian Fede..."). Returns the region name, or False when no
    match is found (False is kept, not None, for existing callers that
    compare with ``!= False``).
    """
    for region in who_regions:
        if country in who_regions[region]:
            return region
    if "..." in country:
        # text before the first '.' of the truncated name
        abrev_country_name = country.split('.', 1)[0]
        for region in who_regions:
            for c in who_regions[region]:
                # Fix: the original used re.match(abrev_country_name, c);
                # treating the fragment as a regex breaks on names containing
                # metacharacters such as '(' — a literal prefix test is intended.
                if c.startswith(abrev_country_name):
                    return region
    print("Country not found among WHO regions: %s" % country)
    return False
# Aggregate the per-country/year report counts up to WHO regions.
selected_regions_years = {}
for region in who_regions.keys():
    selected_regions_years[region] = {}
for country in selected_countries_years.keys():
    region = get_who_region(country)
    if region:  # skip countries that could not be mapped (get_who_region returns False)
        for year in selected_countries_years[country].keys():
            selected_regions_years[region][year] = selected_regions_years[region].get(year, 0) + selected_countries_years[country][year]
from nltk.tokenize import MWETokenizer
def get_context(index, wordlist, window=25):
    """Return up to *window* words on each side of ``wordlist[index]``.

    The word at *index* itself is excluded from the result. *window*
    defaults to 25, preserving the original fixed-size behaviour while
    letting callers choose a different context width.
    """
    lowest_index = max(0, index - window)
    highest_index = min(index + 1 + window, len(wordlist))
    return wordlist[lowest_index:index] + wordlist[index + 1:highest_index]
# --- Keyword analysis over every selected report --------------------------
# Tokenise each report's extracted text, count mentions of health and
# climate keywords and their co-occurrences, and accumulate totals per
# sector, country, year and WHO region for the tables and plots below.
tokenizer = MWETokenizer(compound_terms)
regex = re.compile(r'^.{1,3}$') #words with 3 or less chars
# NOTE(review): `regex` is never used below — short words are filtered with
# len(w) > 3 instead; presumably a leftover.
types_count = {}            # word -> frequency across the whole corpus
tokens_count = 0            # total token count across the whole corpus
per_sector = {}
average_per_sector = {}
proportion_per_sector = {}
per_country = {}
per_country_focusyear = {}
average_per_country_focusyear = {}
proportion_per_country_focusyear = {}
per_year = {}
average_per_year = {}
proportion_per_year = {}
per_region = {}
per_region_year_intersection = {}
average_per_region_year_intersection = {}
proportion_per_region_year_intersection = {}
histogram_number_of_mentions = {}   # termset -> {mention count -> num reports}
histogram_tokens_count = []         # tokens per report, for the median below
global_count_health_keywords = {}
global_health_contexts = []
global_count_climate_keywords = {}
global_climate_contexts = []
global_intersection_contexts = []
cooccurrence_matrix = {}            # health word -> {climate word -> count}
# One sub-dict per term set in each accumulator.
for termset in ["health", "climate", "intersection"]:
    per_sector[termset] = {}
    average_per_sector[termset] = {}
    proportion_per_sector[termset] = {}
    per_country[termset] = {}
    per_country_focusyear[termset] = {}
    average_per_country_focusyear[termset] = {}
    proportion_per_country_focusyear[termset] = {}
    per_year[termset] = {}
    average_per_year[termset] = {}
    proportion_per_year[termset] = {}
    per_region[termset] = {}
    histogram_number_of_mentions[termset] = {}
for region in who_regions.keys():
    per_region_year_intersection[region] = {}
    average_per_region_year_intersection[region] = {}
    proportion_per_region_year_intersection[region] = {}
for pdf in selected_pdfs.keys():
    filenametxt = txts_folder + filenameregex.search(pdf)[0] + ".txt"
    print("Loading %s" % (filenametxt))
    try:
        # skip reports whose text extraction failed (no .txt present)
        txtFileObj = open(filenametxt, 'r')
    except:
        continue
    # split on any non-word character or digit; lowercase everything
    # NOTE(review): txtFileObj is never closed explicitly.
    wordlist = re.split(r'[\W0-9]+', txtFileObj.read().lower())
    tokens_count += len(wordlist)
    histogram_tokens_count.append(len(wordlist))
    for word in wordlist:
        types_count[word] = types_count.get(word, 0) + 1
    # join multi-word keywords into single underscore tokens, drop short words
    compounds_wordlist = tokenizer.tokenize(wordlist)
    filtered_compounds_wordlist = [w for w in compounds_wordlist if (len(w) > 3)]
    health_contexts = []
    climate_contexts = []
    health_words = []
    for i in range(0,len(filtered_compounds_wordlist)):
        word = filtered_compounds_wordlist[i]
        if word in health_dict:
            context = get_context(i, filtered_compounds_wordlist)
            health_contexts.append(context)
            health_words.append(word)
            global_count_health_keywords[word] = global_count_health_keywords.get(word, 0) + 1
            global_health_contexts.extend(context)
        if word in climate_dict:
            context = get_context(i, filtered_compounds_wordlist)
            climate_contexts.append(context)
            global_count_climate_keywords[word] = global_count_climate_keywords.get(word, 0) + 1
            global_climate_contexts.extend(context)
    total_health_mentions = len(health_contexts)
    total_climate_mentions = len(climate_contexts)
    total_intersection_mentions = 0
    # An "intersection" mention is a climate keyword inside the context
    # window of a health keyword; also fills the co-occurrence matrix.
    for i in range(0, len(health_contexts)):
        mention = health_contexts[i]
        hword = health_words[i]
        if hword not in cooccurrence_matrix.keys():
            cooccurrence_matrix[hword] = {}
        for cword in climate_dict:
            if cword in mention:
                total_intersection_mentions += 1
                global_intersection_contexts.extend(mention)
                cooccurrence_matrix[hword][cword] = cooccurrence_matrix[hword].get(cword, 0) + 1
    language = selected_pdfs[pdf]["language"]
    year = selected_pdfs[pdf]["year"]
    country = selected_pdfs[pdf]["country"]
    sector = selected_pdfs[pdf]["sector"]
    region = get_who_region(country)
    histogram_number_of_mentions["health"][total_health_mentions] = histogram_number_of_mentions["health"].get(total_health_mentions, 0) + 1
    histogram_number_of_mentions["climate"][total_climate_mentions] = histogram_number_of_mentions["climate"].get(total_climate_mentions, 0) + 1
    histogram_number_of_mentions["intersection"][total_intersection_mentions] = histogram_number_of_mentions["intersection"].get(total_intersection_mentions, 0) + 1
    per_sector["health"][sector] = per_sector["health"].get(sector,0) + total_health_mentions
    per_sector["climate"][sector] = per_sector["climate"].get(sector,0) + total_climate_mentions
    per_sector["intersection"][sector] = per_sector["intersection"].get(sector,0) + total_intersection_mentions
    per_country["health"][country] = per_country["health"].get(country,0) + total_health_mentions
    per_country["climate"][country] = per_country["climate"].get(country,0) + total_climate_mentions
    per_country["intersection"][country] = per_country["intersection"].get(country,0) + total_intersection_mentions
    if year == focus_year:
        per_country_focusyear["health"][country] = per_country_focusyear["health"].get(country,0) + total_health_mentions
        per_country_focusyear["climate"][country] = per_country_focusyear["climate"].get(country,0) + total_climate_mentions
        per_country_focusyear["intersection"][country] = per_country_focusyear["intersection"].get(country,0) + total_intersection_mentions
    if region != False:
        per_region["health"][region] = per_region["health"].get(region,0) + total_health_mentions
        per_region["climate"][region] = per_region["climate"].get(region,0) + total_climate_mentions
        per_region["intersection"][region] = per_region["intersection"].get(region,0) + total_intersection_mentions
        per_region_year_intersection[region][year] = per_region_year_intersection[region].get(year,0) + total_intersection_mentions
    per_year["health"][year] = per_year["health"].get(year,0) + total_health_mentions
    per_year["climate"][year] = per_year["climate"].get(year,0) + total_climate_mentions
    per_year["intersection"][year] = per_year["intersection"].get(year,0) + total_intersection_mentions
    # "proportion" accumulators count reports with at least one mention;
    # they are converted to percentages after the loop.
    if total_health_mentions > 0:
        proportion_per_year["health"][year] = proportion_per_year["health"].get(year,0) + 1
        proportion_per_sector["health"][sector] = proportion_per_sector["health"].get(sector,0) + 1
        if year == focus_year:
            proportion_per_country_focusyear["health"][country] = proportion_per_country_focusyear["health"].get(country,0) + 1
    if total_climate_mentions > 0:
        proportion_per_year["climate"][year] = proportion_per_year["climate"].get(year,0) + 1
        proportion_per_sector["climate"][sector] = proportion_per_sector["climate"].get(sector,0) + 1
        if year == focus_year:
            proportion_per_country_focusyear["climate"][country] = proportion_per_country_focusyear["climate"].get(country,0) + 1
    if total_intersection_mentions > 0:
        proportion_per_year["intersection"][year] = proportion_per_year["intersection"].get(year,0) + 1
        proportion_per_sector["intersection"][sector] = proportion_per_sector["intersection"].get(sector,0) + 1
        if year == focus_year:
            proportion_per_country_focusyear["intersection"][country] = proportion_per_country_focusyear["intersection"].get(country,0) + 1
        # NOTE(review): if region is False here (unmapped country) this line
        # raises KeyError — verify all countries with intersections map to a region.
        proportion_per_region_year_intersection[region][year] = proportion_per_region_year_intersection[region].get(year,0) + 1
# Convert raw totals into per-report averages and percentages.
for year in selected_years.keys():
    average_per_year["health"][year] = per_year["health"][year]/selected_years[year]
    average_per_year["climate"][year] = per_year["climate"][year]/selected_years[year]
    average_per_year["intersection"][year] = per_year["intersection"][year]/selected_years[year]
    proportion_per_year["health"][year] = proportion_per_year["health"].get(year,0)/selected_years[year] * 100
    proportion_per_year["climate"][year] = proportion_per_year["climate"].get(year,0)/selected_years[year] * 100
    proportion_per_year["intersection"][year] = proportion_per_year["intersection"].get(year,0)/selected_years[year] * 100
for country in selected_countries_years.keys():
    if focus_year in selected_countries_years[country].keys():
        average_per_country_focusyear["health"][country] = per_country_focusyear["health"].get(country, 0)/selected_countries_years[country][focus_year]
        average_per_country_focusyear["climate"][country] = per_country_focusyear["climate"].get(country, 0)/selected_countries_years[country][focus_year]
        average_per_country_focusyear["intersection"][country] = per_country_focusyear["intersection"].get(country, 0)/selected_countries_years[country][focus_year]
        proportion_per_country_focusyear["health"][country] = proportion_per_country_focusyear["health"].get(country,0)/selected_countries_years[country][focus_year] * 100
        proportion_per_country_focusyear["climate"][country] = proportion_per_country_focusyear["climate"].get(country,0)/selected_countries_years[country][focus_year] * 100
        proportion_per_country_focusyear["intersection"][country] = proportion_per_country_focusyear["intersection"].get(country,0)/selected_countries_years[country][focus_year] * 100
for region in selected_regions_years.keys():
    for year in selected_regions_years[region].keys():
        average_per_region_year_intersection[region][year] = per_region_year_intersection[region].get(year,0)/selected_regions_years[region][year]
        proportion_per_region_year_intersection[region][year] = proportion_per_region_year_intersection[region].get(year,0)/selected_regions_years[region][year] * 100
for sector in selected_sectors.keys():
    average_per_sector["health"][sector] = per_sector["health"][sector]/selected_sectors[sector]
    average_per_sector["climate"][sector] = per_sector["climate"][sector]/selected_sectors[sector]
    average_per_sector["intersection"][sector] = per_sector["intersection"][sector]/selected_sectors[sector]
    proportion_per_sector["health"][sector] = proportion_per_sector["health"].get(sector,0)/selected_sectors[sector] * 100
    proportion_per_sector["climate"][sector] = proportion_per_sector["climate"].get(sector,0)/selected_sectors[sector] * 100
    proportion_per_sector["intersection"][sector] = proportion_per_sector["intersection"].get(sector,0)/selected_sectors[sector] * 100
print("Number of tokens: %d" % tokens_count)
print("Number of types: %d" % len(types_count.keys()))
print("Average number of tokens per report: %d" % int(tokens_count/len(selected_pdfs.keys())))
print("Median number of tokens per report: %d" % sorted(histogram_tokens_count)[int(len(selected_pdfs.keys())/2)])
Each bar corresponds to number of reports with x number of mentions
# Histogram: number of reports (log y) per number of mentions (0..99).
df_histogram_number_of_mentions = pd.DataFrame(data=histogram_number_of_mentions)
df_histogram_number_of_mentions.filter(items=list(range(0,100)),axis=0).plot.bar(logy=True, figsize=(20,5))
# --- Totals per year ------------------------------------------------------
# NOTE(review): the year lists below hard-code 2011-2017; adjust when
# focus_year changes.
df_per_year = pd.DataFrame(data=per_year)
df_per_year
ax = df_per_year.filter(items=['2011','2012','2013','2014','2015','2016','2017'],axis=0).plot.line(figsize=(15,5))
ax.set_xticklabels([0, '2011','2012','2013','2014','2015','2016','2017'])
ax.set_xlabel("Year")
ax.set_ylabel("Total number of references")
ax
ax = df_per_year.filter(items=['2011','2012','2013','2014','2015','2016','2017'],axis=0).filter(items=["climate","health"],axis=1).plot.line(figsize=(15,5))
ax.set_xticklabels([0, '2011','2012','2013','2014','2015','2016','2017'])
ax.set_xlabel("Year")
ax.set_ylabel("Total number of references")
ax
ax = df_per_year.filter(items=['2011','2012','2013','2014','2015','2016','2017'],axis=0).filter(items=["intersection"],axis=1).plot.line(figsize=(15,5), color="green", legend=False)
ax.set_xticklabels([0, '2011','2012','2013','2014','2015','2016','2017'])
ax.set_xlabel("Year")
ax.set_ylabel("Total number of references")
ax
df_per_region_year_intersection = pd.DataFrame(data=per_region_year_intersection)
ax = df_per_region_year_intersection.filter(items=['2011','2012','2013','2014','2015','2016','2017'],axis=0).plot.line(figsize=(15,5))
ax.set_xticklabels([0, '2011','2012','2013','2014','2015','2016','2017'])
ax.set_xlabel("Year")
ax.set_ylabel("Total number of references")
ax
# --- Averages per year ----------------------------------------------------
df_average_per_year = pd.DataFrame(data=average_per_year)
df_average_per_year
ax = df_average_per_year.filter(items=['2011','2012','2013','2014','2015','2016','2017'],axis=0).plot.line(figsize=(15,5))
ax.set_xticklabels([0, '2011','2012','2013','2014','2015','2016','2017'])
ax.set_xlabel("Year")
ax.set_ylabel("Average number of references")
ax
ax = df_average_per_year.filter(items=['2011','2012','2013','2014','2015','2016','2017'],axis=0).filter(items=["climate","health"],axis=1).plot.line(figsize=(15,5))
ax.set_xticklabels([0, '2011','2012','2013','2014','2015','2016','2017'])
ax.set_xlabel("Year")
ax.set_ylabel("Average number of references")
ax
ax = df_average_per_year.filter(items=['2011','2012','2013','2014','2015','2016','2017'],axis=0).filter(items=["intersection"],axis=1).plot.line(figsize=(15,5), color="green", legend=False)
ax.set_xticklabels([0, '2011','2012','2013','2014','2015','2016','2017'])
ax.set_xlabel("Year")
ax.set_ylabel("Average number of references")
ax
df_average_per_region_year_intersection = pd.DataFrame(data=average_per_region_year_intersection)
ax = df_average_per_region_year_intersection.filter(items=['2011','2012','2013','2014','2015','2016','2017'],axis=0).plot.line(figsize=(15,5))
ax.set_xticklabels([0, '2011','2012','2013','2014','2015','2016','2017'])
ax.set_xlabel("Year")
ax.set_ylabel("Average number of references")
ax
# --- Proportions per year -------------------------------------------------
df_proportion_per_year = pd.DataFrame(data=proportion_per_year)
df_proportion_per_year
ax = df_proportion_per_year.filter(items=['2011','2012','2013','2014','2015','2016','2017'],axis=0).plot.line(figsize=(15,5))
ax.set_xticklabels([0, '2011','2012','2013','2014','2015','2016','2017'])
ax.set_xlabel("Year")
ax.set_ylabel("Proportion of reports (%)")
ax.set_ylim(ymin=0)
ax
ax = df_proportion_per_year.filter(items=['2011','2012','2013','2014','2015','2016','2017'],axis=0).filter(items=["climate","health"],axis=1).plot.line(figsize=(15,5))
ax.set_xticklabels([0, '2011','2012','2013','2014','2015','2016','2017'])
ax.set_xlabel("Year")
ax.set_ylabel("Proportion of reports (%)")
ax.set_ylim(ymin=0)
ax
ax = df_proportion_per_year.filter(items=['2011','2012','2013','2014','2015','2016','2017'],axis=0).filter(items=["intersection"],axis=1).plot.line(figsize=(15,5), color="green", legend=False)
ax.set_xticklabels([0, '2011','2012','2013','2014','2015','2016','2017'])
ax.set_xlabel("Year")
ax.set_ylabel("Proportion of reports (%)")
ax.set_ylim(ymin=0)
ax
df_proportion_per_region_year_intersection = pd.DataFrame(data=proportion_per_region_year_intersection)
ax = df_proportion_per_region_year_intersection.filter(items=['2011','2012','2013','2014','2015','2016','2017'],axis=0).plot.line(figsize=(15,5))
ax.set_xticklabels([0, '2011','2012','2013','2014','2015','2016','2017'])
ax.set_xlabel("Year")
ax.set_ylabel("Proportion of reports (%)")
ax.set_ylim(ymin=0)
ax
# --- Per sector, country and region ---------------------------------------
df_per_sector = pd.DataFrame(data=per_sector)
df_per_sector
ax = df_per_sector.loc[(df_per_sector['climate'] > 0) | (df_per_sector['health'] > 0)].plot.bar(stacked=True,figsize=(15,5))
ax.set_xlabel("Sector")
ax.set_ylabel("Total number of references")
ax
df_average_per_sector = pd.DataFrame(data=average_per_sector)
df_average_per_sector
ax = df_average_per_sector.loc[(df_average_per_sector['climate'] > 0) | (df_average_per_sector['health'] > 0)].plot.bar(stacked=True,figsize=(20,5))
ax.set_xlabel("Sector")
ax.set_ylabel("Average number of references")
ax
df_proportion_per_sector = pd.DataFrame(data=proportion_per_sector)
df_proportion_per_sector
ax = df_proportion_per_sector.loc[(df_proportion_per_sector['climate'] > 0) | (df_proportion_per_sector['health'] > 0)].plot.bar(stacked=False,figsize=(20,5), title="Proportion of reports with mentions per sector")
ax.set_xlabel("Sector")
ax.set_ylabel("Proportion of reports (%)")
ax
df_per_country = pd.DataFrame(data=per_country)
df_per_country.loc[(df_per_country['climate'] > 0) | (df_per_country['health'] > 0)]
ax = df_per_country.loc[(df_per_country['climate'] > 0) | (df_per_country['health'] > 0)].plot.bar(stacked=True,figsize=(20,10))
ax.set_xlabel("Country")
ax.set_ylabel("Total number of references")
ax
df_per_region = pd.DataFrame(data=per_region)
df_per_region
ax = df_per_region.plot.bar(stacked=True,figsize=(20,10))
ax.set_xlabel("WHO Region")
ax.set_ylabel("Total number of references")
ax
import matplotlib.pyplot as plt
import matplotlib.cm
import numpy as np
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib.colors import Normalize
def create_map(per_country_counts, title, resolution='c'):
    """Draw a world choropleth of per-country counts using Basemap.

    per_country_counts: dict mapping country name -> numeric count.
    title: figure title.
    resolution: Basemap resolution code ('c', 'l', 'i', 'h', 'f' or None).
    """
    # The Global Compact site truncates long country names with "...";
    # translate them (and a few spelling variants) to the names used by the
    # Natural Earth shapefile read below.
    mapped_country_names = {'Bosnia-Herze...' : 'Bosnia and Herz.', "Cote d'Ivoire" : "Côte d'Ivoire",
                            'Congo, Democ...' : 'Dem. Rep. Congo', 'Dominican Re...' : 'Dominican Rep.',
                            'Iran, Islami...' : 'Iran', 'Macedonia, T...' : 'Macedonia',
                            'Moldova, Rep...' : 'Moldova', 'Palestine, S...' : 'Palestine',
                            'Russian Fede...' : 'Russia', 'South Sudan' : 'S. Sudan',
                            'Korea, Repub...' : 'South Korea' , 'Syrian Arab ...' : 'Syria',
                            'Tanzania, Un...' : 'Tanzania', 'Trinidad And...' : 'Trinidad and Tobago',
                            'United Arab ...' : 'United Arab Emirates', 'United State...' : 'United States of America',
                            'Viet Nam' : 'Vietnam'}
    per_normalised_country_counts = []
    for country, count in per_country_counts.items():
        if country in mapped_country_names:
            per_normalised_country_counts.append((mapped_country_names[country], count))
        else:
            per_normalised_country_counts.append((country, count))
    df_per_normalised_country_counts = pd.DataFrame(per_normalised_country_counts, columns=["country", "count"])
    fig, ax = plt.subplots(figsize=(30,20))
    plt.title(title)
    m = Basemap(resolution=resolution, # c, l, i, h, f or None
                projection='cyl',
                lat_0=0, lon_0=0,
                llcrnrlon=-170, llcrnrlat=-60, urcrnrlon=190, urcrnrlat=84)
    m.drawmapboundary(fill_color='#aaccec')
    m.fillcontinents(color='#f2f2f2',lake_color='#aaccec')
    m.drawcoastlines()
    # Requires the Natural Earth 110m admin-0 shapefile in the working directory.
    m.readshapefile("ne_110m_admin_0_countries", "countries")
    # keep only the leading word/space/dot/apostrophe run of each shape name
    shapename_regex = re.compile(r'(?<=^)[\w \.\']+')
    df_poly = pd.DataFrame({
        'shapes': [Polygon(np.array(shape), True) for shape in m.countries],
        'country': [shapename_regex.search(area['NAME'])[0] for area in m.countries_info]
    })
    # left-join the counts; countries without reports get NaN (filled as 0)
    df_poly = df_poly.merge(df_per_normalised_country_counts, on='country', how='left', validate="many_to_one")
    cmap = plt.get_cmap('YlOrRd')
    pc = PatchCollection(df_poly.shapes, zorder=2)
    norm = Normalize()
    pc.set_facecolor(cmap(norm(df_poly['count'].fillna(0).values)))
    ax.add_collection(pc)
    mapper = matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap)
    mapper.set_array(df_poly['count'])
    plt.colorbar(mapper, shrink=0.5)
# Choropleths for the focus year. Titles are built from focus_year instead of
# the original hard-coded "2017", so the labels stay correct when the
# analysis is re-run for a different year.
map_title_prefix = focus_year + " - Global Compact reports - "
create_map(per_country_focusyear["health"], map_title_prefix + "Health - Total number of references")
create_map(per_country_focusyear["climate"], map_title_prefix + "Climate - Total number of references")
create_map(per_country_focusyear["intersection"], map_title_prefix + "Health&Climate intersection - Total number of references")
create_map(average_per_country_focusyear["health"], map_title_prefix + "Health - Average number of references")
create_map(average_per_country_focusyear["climate"], map_title_prefix + "Climate - Average number of references")
create_map(average_per_country_focusyear["intersection"], map_title_prefix + "Health&Climate intersection - Average number of references")
create_map(proportion_per_country_focusyear["health"], map_title_prefix + "Health - Proportion of reports")
create_map(proportion_per_country_focusyear["climate"], map_title_prefix + "Climate - Proportion of reports")
create_map(proportion_per_country_focusyear["intersection"], map_title_prefix + "Health&Climate intersection - Proportion of reports")
# Corpus-wide keyword frequency tables, sorted by number of mentions.
df_health_keywords = pd.DataFrame(sorted(global_count_health_keywords.items(), key=lambda k: k[1], reverse=True), columns=["Keyword", "Number of mentions"])
df_health_keywords
df_climate_keywords = pd.DataFrame(sorted(global_count_climate_keywords.items(), key=lambda k: k[1], reverse=True), columns=["Keyword", "Number of mentions"])
df_climate_keywords
Up to 200 of the most frequent words that appear in the context of our health keywords, our climate keywords, or both.
import collections
from wordcloud import WordCloud, STOPWORDS
from stop_words import get_stop_words
# Keep at most this many of the most frequent context words per word cloud.
threshold = 200
# Generic stop words for the configured focus language (stop_words package).
language_specific_stopwords = get_stop_words(focus_language)
def create_wordcloud(contexts, stopwords=None):
    """Plot a word cloud of the most frequent context words.

    Counts the unigrams in ``contexts``, takes the ``threshold`` (module
    level, 200) most frequent ones, drops any that are stop words, and
    renders the remainder as a word cloud sized by frequency — so the cloud
    shows "up to" ``threshold`` words, as in the original.

    Parameters
    ----------
    contexts : iterable of str
        Context words gathered around the keyword hits.
    stopwords : list of str, optional
        Extra words to exclude on top of wordcloud's STOPWORDS and the
        language-specific stop words.

    Fixes: the original signature used a mutable default (``stopwords=[]``)
    and ``extend``ed it in place, which (a) accumulated stop words across
    calls that relied on the default and (b) mutated caller-supplied lists
    (e.g. ``health_dict``). A local set is built instead, which also makes
    the membership test O(1) rather than O(n).
    """
    # Build the exclusion set without mutating the caller's list.
    excluded = set(stopwords) if stopwords is not None else set()
    excluded.update(STOPWORDS)
    excluded.update(language_specific_stopwords)
    context_unigrams = collections.Counter(contexts)
    # Stop words are filtered AFTER taking the top `threshold` entries
    # (kept from the original, hence "up to" threshold words in the cloud).
    most_frequent_words = {
        word: freq
        for word, freq in sorted(context_unigrams.items(),
                                 key=lambda item: item[1], reverse=True)[:threshold]
        if word not in excluded
    }
    wordcloud = WordCloud(background_color="white", scale=10).generate_from_frequencies(most_frequent_words)
    fig = plt.figure(1, figsize=(20, 12))
    plt.axis('off')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.show()
# Word clouds for the three corpora. The keyword lists themselves are passed
# as extra stop words so the search keywords don't dominate the cloud of
# their own contexts.
# NOTE(review): as defined above, create_wordcloud extends the stopword list
# it receives in place, so health_dict / climate_dict gain the generic stop
# words after these calls — verify no later cell relies on the original lists.
create_wordcloud(global_health_contexts, health_dict)
create_wordcloud(global_climate_contexts, climate_dict)
# For the intersection corpus, both keyword sets are excluded.
create_wordcloud(global_intersection_contexts, climate_dict + health_dict)
Blue nodes are health keywords and green nodes are climate keywords. The closer the nodes are to each other, the more often the words co-occur. The graph only shows links between words that co-occurred at least `min_coocurrence` times in the whole corpus (10 for English, 2 for the other supported languages).
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
# Build an undirected co-occurrence graph: one edge per (health word,
# climate word) pair that co-occurs at least `min_coocurrence` times.
G = nx.Graph()
health_nodes = []
climate_nodes = []
# Loop-invariant threshold, hoisted out of the double loop.
min_links = language_ref[focus_language]['min_coocurrence']
for health_word, partners in cooccurrence_matrix.items():
    for climate_word, pair_count in partners.items():
        if pair_count >= min_links:
            G.add_edge(health_word, climate_word, weight=pair_count)
            health_nodes.append(health_word)
            climate_nodes.append(climate_word)

# Force-directed layout: heavier (more frequent) edges pull nodes closer.
plt.figure(figsize=(15, 15))
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, nodelist=health_nodes, node_size=1000, node_color='b')
nx.draw_networkx_nodes(G, pos, nodelist=climate_nodes, node_size=1000, node_color='g')
nx.draw_networkx_edges(G, pos, edgelist=G.edges(data=True), width=2)
nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif')
plt.axis('off')
plt.show()